InΒ [2]:
import pandas as pd
from helper import get_my_dataset
import seaborn as sns
import matplotlib.pyplot as plt
InΒ [3]:
"""
Get the data_set filtered for my borough
"""
main_df = get_my_dataset()
InΒ [4]:
"""
Checking the rows and columns of the dataset
"""
main_df.shape
Out[4]:
(2070, 32)
InΒ [5]:
main_df.dtypes
Out[5]:
Accident_Index object Location_Easting_OSGR int64 Location_Northing_OSGR int64 Longitude float64 Latitude float64 Police_Force int64 Accident_Severity int64 Number_of_Vehicles int64 Number_of_Casualties int64 Date object Day_of_Week int64 Time object Local_Authority_District int64 Local_Authority_Highway object 1st_Road_Class int64 1st_Road_Number int64 Road_Type int64 Speed_limit int64 Junction_Detail int64 Junction_Control int64 2nd_Road_Class int64 2nd_Road_Number int64 Pedestrian_Crossing-Human_Control int64 Pedestrian_Crossing-Physical_Facilities int64 Light_Conditions int64 Weather_Conditions int64 Road_Surface_Conditions int64 Special_Conditions_at_Site int64 Carriageway_Hazards int64 Urban_or_Rural_Area int64 Did_Police_Officer_Attend_Scene_of_Accident int64 LSOA_of_Accident_Location object dtype: object
InΒ [6]:
"""
Function to change data types according to manually updated data-types.csv.
"""
def process_dataframe(df, csv_path):
# Read the CSV file
instructions = pd.read_csv(csv_path)
# Iterate over the instructions
for index, row in instructions.iterrows():
col_name = row.iloc[0]
new_type = row.iloc[1]
# Change the data type of the column
if new_type == "numerical":
df[col_name] = pd.to_numeric(df[col_name], errors='coerce')
elif new_type == "date":
df[col_name] = pd.to_datetime(df[col_name], dayfirst=True, format='%d/%m/%Y', errors='coerce')
elif new_type == "time":
df[col_name] = pd.to_datetime(df[col_name], format='%H:%M').dt.time
elif new_type == "categorical":
df[col_name] = df[col_name].astype('str')
return df
# Example usage
processed_df = process_dataframe(main_df, 'data_types.csv')
processed_df.dtypes
Out[6]:
Accident_Index object Location_Easting_OSGR int64 Location_Northing_OSGR int64 Longitude float64 Latitude float64 Police_Force object Accident_Severity object Number_of_Vehicles int64 Number_of_Casualties int64 Date datetime64[ns] Day_of_Week object Time object Local_Authority_District object Local_Authority_Highway object 1st_Road_Class object 1st_Road_Number object Road_Type object Speed_limit object Junction_Detail object Junction_Control object 2nd_Road_Class object 2nd_Road_Number object Pedestrian_Crossing-Human_Control object Pedestrian_Crossing-Physical_Facilities object Light_Conditions object Weather_Conditions object Road_Surface_Conditions object Special_Conditions_at_Site object Carriageway_Hazards object Urban_or_Rural_Area object Did_Police_Officer_Attend_Scene_of_Accident object LSOA_of_Accident_Location object dtype: object
InΒ [7]:
"""
Checking missing values
"""
# Define all conditions for missing or invalid values
conditions = [
processed_df.isnull(), # NaN or None
processed_df.isna(), # Alias for isnull()
(processed_df == 'Nan'), # String 'Nan'
(processed_df == 'nan'), # String 'nan'
(processed_df == pd.NA), # Pandas NA
(processed_df == '-1'), # String '-1'
(processed_df == ''), # Empty string
(processed_df == 'NULL'), # String 'NULL'
(processed_df == None) # Python None
]
# Combine all conditions using logical OR
combined_mask = conditions[0]
for condition in conditions[1:]:
combined_mask |= condition
# Count missing values per column
missing_values = combined_mask.sum()
# Print the results
print(missing_values)
Accident_Index 0 Location_Easting_OSGR 0 Location_Northing_OSGR 0 Longitude 0 Latitude 0 Police_Force 0 Accident_Severity 0 Number_of_Vehicles 0 Number_of_Casualties 0 Date 0 Day_of_Week 0 Time 0 Local_Authority_District 0 Local_Authority_Highway 0 1st_Road_Class 0 1st_Road_Number 0 Road_Type 0 Speed_limit 0 Junction_Detail 0 Junction_Control 570 2nd_Road_Class 570 2nd_Road_Number 0 Pedestrian_Crossing-Human_Control 0 Pedestrian_Crossing-Physical_Facilities 0 Light_Conditions 0 Weather_Conditions 0 Road_Surface_Conditions 0 Special_Conditions_at_Site 0 Carriageway_Hazards 0 Urban_or_Rural_Area 0 Did_Police_Officer_Attend_Scene_of_Accident 0 LSOA_of_Accident_Location 4 dtype: int64
InΒ [8]:
# Create a heatmap to visualize the missing values
plt.figure(figsize=(12, 8))
sns.heatmap(combined_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()
InΒ [9]:
#Outliers
print(processed_df.shape)
# Select numeric variables
my_data_numeric = processed_df.select_dtypes(include='number')
print('The dataset has', len(my_data_numeric.columns),'Numeric variables:')
print(my_data_numeric)
# Select categorical variables
my_data_categorical = processed_df.select_dtypes(include='object')
print('The dataset has', len(my_data_categorical.columns),'Categorical variables:')
print(my_data_categorical)
(2070, 32)
The dataset has 6 Numeric variables:
Location_Easting_OSGR Location_Northing_OSGR Longitude Latitude \
0 521760 169840 -0.250492 51.414457
1 518140 165370 -0.304009 51.375047
2 519130 163980 -0.290258 51.362347
3 520840 167190 -0.264617 51.390837
4 521280 172110 -0.256614 51.434962
... ... ... ... ...
2065 517330 161850 -0.316802 51.343577
2066 517870 169360 -0.306563 51.410965
2067 519370 169000 -0.285126 51.407416
2068 519330 169110 -0.285664 51.408413
2069 521830 169870 -0.249475 51.414712
Number_of_Vehicles Number_of_Casualties
0 1 1
1 3 1
2 2 2
3 3 1
4 2 1
... ... ...
2065 2 1
2066 2 1
2067 2 1
2068 1 1
2069 2 1
[2070 rows x 6 columns]
The dataset has 25 Categorical variables:
Accident_Index Police_Force Accident_Severity Day_of_Week Time \
0 200501TE00016 1 3 5 07:25:00
1 200501TE00017 1 3 3 08:35:00
2 200501TE00026 1 2 1 01:17:00
3 200501TE00039 1 3 1 18:00:00
4 200501TE00060 1 2 6 06:20:00
... ... ... ... ... ...
2065 201001VK30404 1 3 3 14:45:00
2066 201001VK30408 1 3 4 16:10:00
2067 201001VK30420 1 3 2 08:00:00
2068 201001VK39057 1 2 2 18:45:00
2069 201001VW40202 1 3 5 18:00:00
Local_Authority_District Local_Authority_Highway 1st_Road_Class \
0 23 E09000021 3
1 23 E09000021 3
2 23 E09000021 5
3 23 E09000021 3
4 23 E09000021 3
... ... ... ...
2065 23 E09000021 3
2066 23 E09000021 6
2067 23 E09000021 6
2068 23 E09000021 3
2069 23 E09000021 3
1st_Road_Number Road_Type ... Pedestrian_Crossing-Human_Control \
0 3 6 ... 0
1 3 3 ... 0
2 0 3 ... 0
3 3 3 ... 0
4 3 3 ... 0
... ... ... ... ...
2065 243 3 ... 0
2066 0 6 ... 0
2067 0 6 ... 0
2068 2043 6 ... 0
2069 238 6 ... 0
Pedestrian_Crossing-Physical_Facilities Light_Conditions \
0 0 4
1 0 1
2 0 4
3 0 4
4 0 4
... ... ...
2065 0 1
2066 5 4
2067 0 1
2068 4 1
2069 0 1
Weather_Conditions Road_Surface_Conditions Special_Conditions_at_Site \
0 1 2 0
1 1 2 0
2 1 1 0
3 1 1 0
4 1 1 0
... ... ... ...
2065 1 1 0
2066 1 1 0
2067 2 2 0
2068 1 1 0
2069 1 1 0
Carriageway_Hazards Urban_or_Rural_Area \
0 0 1
1 0 1
2 0 1
3 0 1
4 0 1
... ... ...
2065 0 2
2066 0 1
2067 0 1
2068 6 1
2069 0 1
Did_Police_Officer_Attend_Scene_of_Accident LSOA_of_Accident_Location
0 1 E01002953
1 1 E01003002
2 1 E01002946
3 1 E01002981
4 1 E01002955
... ... ...
2065 2 E01002948
2066 1 E01002968
2067 2 E01002969
2068 1 E01002970
2069 1 E01002953
[2070 rows x 25 columns]
InΒ [10]:
# Outliers - For numerics
fig = plt.figure(figsize= (15,30))
for i in range(len(my_data_numeric.columns)):
column = my_data_numeric.columns[i]
sub = fig.add_subplot(8,3,i+1)
chart = sns.boxplot(data=my_data_numeric, y=column, color='g')
InΒ [11]:
# Use histogram for all the numeical variables
fig = plt.figure(figsize= (15,30))
for i in range(len(my_data_numeric.columns)):
column = my_data_numeric.columns[i]
sub = fig.add_subplot(8,3, i+1)
chart = sns.histplot(data=my_data_numeric, x=column, bins=50, color='g', kde=False)
InΒ [12]:
# Outliers - for categorical values
# Number of categorical variables
num_categorical = len(my_data_categorical.columns)
# Create a figure with enough rows
fig = plt.figure(figsize=(15, num_categorical * 5)) # Adjust height dynamically
for i, column in enumerate(my_data_categorical.columns):
sub = fig.add_subplot(num_categorical, 1, i+1) # Use the total number of plots
sns.countplot(data=processed_df, x=column, color='y', ax=sub) # Use ax=sub for subplot integration
sub.set_title(f"Count Plot for {column}", fontsize=12)
plt.xticks(rotation=45) # Rotate x-axis labels if necessary for clarity
plt.tight_layout() # Adjust layout to prevent overlap
plt.show()
InΒ [13]:
#Tackle data quality issues
# Drop duplicates
processed_df = processed_df[processed_df.columns[~processed_df.columns.isin(['Location_Easting_OSGR', 'Location_Northing_OSGR'])]] # Drop two variables TOWN_No and CMEDV
processed_df
Out[13]:
| Accident_Index | Longitude | Latitude | Police_Force | Accident_Severity | Number_of_Vehicles | Number_of_Casualties | Date | Day_of_Week | Time | ... | Pedestrian_Crossing-Human_Control | Pedestrian_Crossing-Physical_Facilities | Light_Conditions | Weather_Conditions | Road_Surface_Conditions | Special_Conditions_at_Site | Carriageway_Hazards | Urban_or_Rural_Area | Did_Police_Officer_Attend_Scene_of_Accident | LSOA_of_Accident_Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200501TE00016 | -0.250492 | 51.414457 | 1 | 3 | 1 | 1 | 2005-01-13 | 5 | 07:25:00 | ... | 0 | 0 | 4 | 1 | 2 | 0 | 0 | 1 | 1 | E01002953 |
| 1 | 200501TE00017 | -0.304009 | 51.375047 | 1 | 3 | 3 | 1 | 2005-01-18 | 3 | 08:35:00 | ... | 0 | 0 | 1 | 1 | 2 | 0 | 0 | 1 | 1 | E01003002 |
| 2 | 200501TE00026 | -0.290258 | 51.362347 | 1 | 2 | 2 | 2 | 2005-01-09 | 1 | 01:17:00 | ... | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 1 | 1 | E01002946 |
| 3 | 200501TE00039 | -0.264617 | 51.390837 | 1 | 3 | 3 | 1 | 2005-01-23 | 1 | 18:00:00 | ... | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 1 | 1 | E01002981 |
| 4 | 200501TE00060 | -0.256614 | 51.434962 | 1 | 2 | 2 | 1 | 2005-01-21 | 6 | 06:20:00 | ... | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 1 | 1 | E01002955 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2065 | 201001VK30404 | -0.316802 | 51.343577 | 1 | 3 | 2 | 1 | 2010-12-14 | 3 | 14:45:00 | ... | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 2 | 2 | E01002948 |
| 2066 | 201001VK30408 | -0.306563 | 51.410965 | 1 | 3 | 2 | 1 | 2010-12-22 | 4 | 16:10:00 | ... | 0 | 5 | 4 | 1 | 1 | 0 | 0 | 1 | 1 | E01002968 |
| 2067 | 201001VK30420 | -0.285126 | 51.407416 | 1 | 3 | 2 | 1 | 2010-11-08 | 2 | 08:00:00 | ... | 0 | 0 | 1 | 2 | 2 | 0 | 0 | 1 | 2 | E01002969 |
| 2068 | 201001VK39057 | -0.285664 | 51.408413 | 1 | 2 | 1 | 1 | 2010-05-17 | 2 | 18:45:00 | ... | 0 | 4 | 1 | 1 | 1 | 0 | 6 | 1 | 1 | E01002970 |
| 2069 | 201001VW40202 | -0.249475 | 51.414712 | 1 | 3 | 2 | 1 | 2010-06-17 | 5 | 18:00:00 | ... | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | E01002953 |
2070 rows Γ 30 columns
InΒ [14]:
my_data_no_missing = processed_df.dropna()
my_data_no_missing.shape
Out[14]:
(2070, 30)
InΒ [15]:
numerical_cols = ['Number_of_Casualties', 'Longitude', 'Latitude']
# Remove outliers using IQR
outliers_index = []
for col in numerical_cols:
Q1 = my_data_no_missing[col].quantile(0.25)
Q3 = my_data_no_missing[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers_index += my_data_no_missing.index[(my_data_no_missing[col] < lower_bound) | (my_data_no_missing[col] > upper_bound)].tolist()
# Drop rows with outliers
outliers_index = set(outliers_index)
my_data_cleaned = my_data_no_missing.drop(index=outliers_index)
my_data_cleaned.shape
Out[15]:
(1705, 30)
InΒ [16]:
#analysis without outlier removing
light_conditions_count = my_data_cleaned['Light_Conditions'].value_counts(normalize=True)
print("Proportion of Light Conditions:")
print(light_conditions_count)
# Example: Analyzing Weather Conditions
weather_conditions_count = my_data_cleaned['Weather_Conditions'].value_counts(normalize=True)
print("Proportion of Weather Conditions:")
print(weather_conditions_count)
Proportion of Light Conditions: Light_Conditions 1 0.777713 4 0.221114 7 0.000587 6 0.000587 Name: proportion, dtype: float64 Proportion of Weather Conditions: Weather_Conditions 1 0.889736 2 0.080938 8 0.015249 4 0.006452 5 0.002933 3 0.002346 7 0.001760 9 0.000587 Name: proportion, dtype: float64
InΒ [17]:
# Plot for Light_Conditions
plt.figure(figsize=(8, 5))
light_conditions_count.plot(kind='bar', color='skyblue', alpha=0.8)
plt.title('Distribution of Light Conditions')
plt.xlabel('Light Conditions')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Plot for Weather_Conditions
plt.figure(figsize=(8, 5))
weather_conditions_count.plot(kind='bar', color='orange', alpha=0.8)
plt.title('Distribution of Weather Conditions')
plt.xlabel('Weather Conditions')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
InΒ [18]:
# Filter rows for serious and fatal accidents
serious_accidents = my_data_cleaned[my_data_cleaned['Accident_Severity'].isin(['Serious', 'Fatal'])]
# Analyze Weather Conditions for serious accidents
serious_weather = serious_accidents['Weather_Conditions'].value_counts(normalize=True)
print("Weather Conditions for Serious/Fatal Accidents:")
print(serious_weather)
# Analyze Road Surface Conditions for serious accidents
serious_road_surface = serious_accidents['Road_Surface_Conditions'].value_counts(normalize=True)
print("Road Surface Conditions for Serious/Fatal Accidents:")
print(serious_road_surface)
Weather Conditions for Serious/Fatal Accidents: Series([], Name: proportion, dtype: float64) Road Surface Conditions for Serious/Fatal Accidents: Series([], Name: proportion, dtype: float64)
InΒ [19]:
# Plot accident hotspots
plt.figure(figsize=(10, 6))
plt.scatter(my_data_cleaned['Longitude'], my_data_cleaned['Latitude'], alpha=0.5, s=10)
plt.title("Accident Hotspots")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()
InΒ [20]:
import folium
# Create a base map centered around the mean location
center_lat = my_data_cleaned['Latitude'].mean()
center_lon = my_data_cleaned['Longitude'].mean()
map_hotspots = folium.Map(location=[center_lat, center_lon], zoom_start=13)
# Add individual markers for each accident location
for _, row in my_data_cleaned.iterrows():
folium.CircleMarker(
location=[row['Latitude'], row['Longitude']],
radius=2, # Radius of the marker
color='blue', # Border color of the circle
fill=True,
fill_color='blue', # Fill color of the circle
fill_opacity=0.6 # Opacity of the marker
).add_to(map_hotspots)
# Display the map
map_hotspots
Out[20]:
Make this Notebook Trusted to load map: File -> Trust Notebook
InΒ [21]:
import folium
from folium.plugins import MarkerCluster
# Create a base map centered around the mean latitude and longitude
center_lat = my_data_cleaned['Latitude'].mean()
center_lon = my_data_cleaned['Longitude'].mean()
map_hotspots = folium.Map(location=[center_lat, center_lon], zoom_start=10)
# Create a marker cluster to group nearby accidents
marker_cluster = MarkerCluster().add_to(map_hotspots)
# Add markers for each accident location
for _, row in my_data_cleaned.iterrows():
folium.Marker(location=[row['Latitude'], row['Longitude']]).add_to(marker_cluster)
# Display the map
map_hotspots
Out[21]:
Make this Notebook Trusted to load map: File -> Trust Notebook
InΒ [22]:
print(my_data_cleaned.shape)
fig = plt.figure(figsize=(15, 30))
# Filter numeric columns
numeric_cols = my_data_cleaned.select_dtypes(include='number').columns
for i, column in enumerate(numeric_cols):
sub = fig.add_subplot(8, 3, i + 1)
chart = sns.histplot(data=my_data_cleaned, x=column, bins=50, color='g', kde=False)
plt.tight_layout()
plt.show()
(1705, 30)
InΒ [23]:
# Plot Boxplot after IQR applied
fig = plt.figure(figsize=(15, 30))
my_data = my_data_cleaned.select_dtypes(include='number')
for i in range(len(my_data.columns)):
column = my_data.columns[i]
sub = fig.add_subplot(8, 3, i + 1)
chart = sns.boxplot(data=my_data, y=column, color='y')
InΒ [24]:
# List of categorical columns
categorical_cols = [
'Police_Force', 'Accident_Severity', 'Day_of_Week', 'Local_Authority_District',
'Local_Authority_Highway', '1st_Road_Class', 'Road_Type', 'Speed_limit',
'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Urban_or_Rural_Area',
'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location'
]
# Analyze and visualize distributions
for col in categorical_cols:
print(f"Class distribution for {col}:")
print(my_data_cleaned[col].value_counts(normalize=True) * 100) # Print proportions
print()
# Plot class distribution
plt.figure(figsize=(8, 4))
my_data_cleaned[col].value_counts(normalize=True).plot(
kind='bar', color='skyblue', alpha=0.8
)
plt.title(f"Class Distribution for {col}")
plt.xlabel("Class")
plt.ylabel("Proportion (%)")
plt.xticks(rotation=45)
plt.show()
Class distribution for Police_Force: Police_Force 1 100.0 Name: proportion, dtype: float64
Class distribution for Accident_Severity: Accident_Severity 3 85.161290 2 14.310850 1 0.527859 Name: proportion, dtype: float64
Class distribution for Day_of_Week: Day_of_Week 6 16.304985 4 16.011730 2 15.835777 5 15.601173 3 15.307918 7 12.023460 1 8.914956 Name: proportion, dtype: float64
Class distribution for Local_Authority_District: Local_Authority_District 23 100.0 Name: proportion, dtype: float64
Class distribution for Local_Authority_Highway: Local_Authority_Highway E09000021 100.0 Name: proportion, dtype: float64
Class distribution for 1st_Road_Class: 1st_Road_Class 3 65.337243 6 15.835777 5 11.495601 4 7.331378 Name: proportion, dtype: float64
Class distribution for Road_Type: Road_Type 6 72.316716 3 16.363636 1 6.334311 2 3.167155 7 1.759531 9 0.058651 Name: proportion, dtype: float64
Class distribution for Speed_limit: Speed_limit 30 93.782991 50 4.340176 40 1.231672 20 0.645161 Name: proportion, dtype: float64
Class distribution for Junction_Detail: Junction_Detail 3 44.574780 0 27.096774 6 10.146628 1 8.328446 8 4.164223 7 2.346041 5 1.759531 2 1.466276 9 0.117302 Name: proportion, dtype: float64
Class distribution for Junction_Control: Junction_Control 4 58.299120 -1 27.096774 2 14.545455 1 0.058651 Name: proportion, dtype: float64
Class distribution for 2nd_Road_Class: 2nd_Road_Class 6 45.102639 -1 27.096774 3 14.838710 5 7.683284 4 5.278592 Name: proportion, dtype: float64
Class distribution for Pedestrian_Crossing-Human_Control: Pedestrian_Crossing-Human_Control 0 99.882698 2 0.117302 Name: proportion, dtype: float64
Class distribution for Pedestrian_Crossing-Physical_Facilities: Pedestrian_Crossing-Physical_Facilities 0 77.184751 5 9.208211 1 7.448680 4 5.630499 8 0.410557 7 0.117302 Name: proportion, dtype: float64
Class distribution for Light_Conditions: Light_Conditions 1 77.771261 4 22.111437 7 0.058651 6 0.058651 Name: proportion, dtype: float64
Class distribution for Weather_Conditions: Weather_Conditions 1 88.973607 2 8.093842 8 1.524927 4 0.645161 5 0.293255 3 0.234604 7 0.175953 9 0.058651 Name: proportion, dtype: float64
Class distribution for Road_Surface_Conditions: Road_Surface_Conditions 1 83.284457 2 15.542522 4 0.938416 3 0.234604 Name: proportion, dtype: float64
Class distribution for Special_Conditions_at_Site: Special_Conditions_at_Site 0 98.592375 4 0.703812 6 0.469208 5 0.058651 7 0.058651 1 0.058651 3 0.058651 Name: proportion, dtype: float64
Class distribution for Carriageway_Hazards: Carriageway_Hazards 0 98.944282 7 0.410557 2 0.293255 1 0.175953 6 0.117302 3 0.058651 Name: proportion, dtype: float64
Class distribution for Urban_or_Rural_Area: Urban_or_Rural_Area 1 97.302053 2 2.697947 Name: proportion, dtype: float64
Class distribution for Did_Police_Officer_Attend_Scene_of_Accident: Did_Police_Officer_Attend_Scene_of_Accident 1 81.935484 2 17.302053 3 0.762463 Name: proportion, dtype: float64
Class distribution for LSOA_of_Accident_Location:
LSOA_of_Accident_Location
E01002968 11.436950
E01002984 3.460411
E01002948 3.343109
E01003003 2.932551
E01003004 2.639296
...
E01030388 0.058651
E01003445 0.058651
E01002937 0.058651
E01003817 0.058651
E01002923 0.058651
Name: proportion, Length: 100, dtype: float64
InΒ [25]:
#Explore basic statistics of each attribute
categorical_cols = [
'Police_Force', 'Accident_Severity', 'Day_of_Week', 'Local_Authority_District',
'Local_Authority_Highway', '1st_Road_Class', 'Road_Type', 'Speed_limit',
'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities',
'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Urban_or_Rural_Area',
'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location'
]
for col in categorical_cols:
print(f"Statistics for {col}:\n")
# Frequency counts and proportions
counts = my_data_cleaned[col].value_counts()
proportions = my_data_cleaned[col].value_counts(normalize=True)
# Mode (most frequent value)
mode = my_data_cleaned[col].mode()[0]
print(f"Counts:\n{counts}")
print(f"Proportions (%):\n{proportions * 100}")
print(f"Mode: {mode}\n")
# Visualization: Bar plot for class distributions
plt.figure(figsize=(8, 4))
counts.plot(kind='bar', color='skyblue', alpha=0.8)
plt.title(f"Class Distribution for {col}")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()
Statistics for Police_Force: Counts: Police_Force 1 1705 Name: count, dtype: int64 Proportions (%): Police_Force 1 100.0 Name: proportion, dtype: float64 Mode: 1
Statistics for Accident_Severity: Counts: Accident_Severity 3 1452 2 244 1 9 Name: count, dtype: int64 Proportions (%): Accident_Severity 3 85.161290 2 14.310850 1 0.527859 Name: proportion, dtype: float64 Mode: 3
Statistics for Day_of_Week: Counts: Day_of_Week 6 278 4 273 2 270 5 266 3 261 7 205 1 152 Name: count, dtype: int64 Proportions (%): Day_of_Week 6 16.304985 4 16.011730 2 15.835777 5 15.601173 3 15.307918 7 12.023460 1 8.914956 Name: proportion, dtype: float64 Mode: 6
Statistics for Local_Authority_District: Counts: Local_Authority_District 23 1705 Name: count, dtype: int64 Proportions (%): Local_Authority_District 23 100.0 Name: proportion, dtype: float64 Mode: 23
Statistics for Local_Authority_Highway: Counts: Local_Authority_Highway E09000021 1705 Name: count, dtype: int64 Proportions (%): Local_Authority_Highway E09000021 100.0 Name: proportion, dtype: float64 Mode: E09000021
Statistics for 1st_Road_Class: Counts: 1st_Road_Class 3 1114 6 270 5 196 4 125 Name: count, dtype: int64 Proportions (%): 1st_Road_Class 3 65.337243 6 15.835777 5 11.495601 4 7.331378 Name: proportion, dtype: float64 Mode: 3
Statistics for Road_Type: Counts: Road_Type 6 1233 3 279 1 108 2 54 7 30 9 1 Name: count, dtype: int64 Proportions (%): Road_Type 6 72.316716 3 16.363636 1 6.334311 2 3.167155 7 1.759531 9 0.058651 Name: proportion, dtype: float64 Mode: 6
Statistics for Speed_limit: Counts: Speed_limit 30 1599 50 74 40 21 20 11 Name: count, dtype: int64 Proportions (%): Speed_limit 30 93.782991 50 4.340176 40 1.231672 20 0.645161 Name: proportion, dtype: float64 Mode: 30
Statistics for Junction_Detail: Counts: Junction_Detail 3 760 0 462 6 173 1 142 8 71 7 40 5 30 2 25 9 2 Name: count, dtype: int64 Proportions (%): Junction_Detail 3 44.574780 0 27.096774 6 10.146628 1 8.328446 8 4.164223 7 2.346041 5 1.759531 2 1.466276 9 0.117302 Name: proportion, dtype: float64 Mode: 3
Statistics for Junction_Control: Counts: Junction_Control 4 994 -1 462 2 248 1 1 Name: count, dtype: int64 Proportions (%): Junction_Control 4 58.299120 -1 27.096774 2 14.545455 1 0.058651 Name: proportion, dtype: float64 Mode: 4
Statistics for 2nd_Road_Class: Counts: 2nd_Road_Class 6 769 -1 462 3 253 5 131 4 90 Name: count, dtype: int64 Proportions (%): 2nd_Road_Class 6 45.102639 -1 27.096774 3 14.838710 5 7.683284 4 5.278592 Name: proportion, dtype: float64 Mode: 6
Statistics for Pedestrian_Crossing-Human_Control: Counts: Pedestrian_Crossing-Human_Control 0 1703 2 2 Name: count, dtype: int64 Proportions (%): Pedestrian_Crossing-Human_Control 0 99.882698 2 0.117302 Name: proportion, dtype: float64 Mode: 0
Statistics for Pedestrian_Crossing-Physical_Facilities: Counts: Pedestrian_Crossing-Physical_Facilities 0 1316 5 157 1 127 4 96 8 7 7 2 Name: count, dtype: int64 Proportions (%): Pedestrian_Crossing-Physical_Facilities 0 77.184751 5 9.208211 1 7.448680 4 5.630499 8 0.410557 7 0.117302 Name: proportion, dtype: float64 Mode: 0
Statistics for Light_Conditions: Counts: Light_Conditions 1 1326 4 377 7 1 6 1 Name: count, dtype: int64 Proportions (%): Light_Conditions 1 77.771261 4 22.111437 7 0.058651 6 0.058651 Name: proportion, dtype: float64 Mode: 1
Statistics for Weather_Conditions: Counts: Weather_Conditions 1 1517 2 138 8 26 4 11 5 5 3 4 7 3 9 1 Name: count, dtype: int64 Proportions (%): Weather_Conditions 1 88.973607 2 8.093842 8 1.524927 4 0.645161 5 0.293255 3 0.234604 7 0.175953 9 0.058651 Name: proportion, dtype: float64 Mode: 1
Statistics for Road_Surface_Conditions: Counts: Road_Surface_Conditions 1 1420 2 265 4 16 3 4 Name: count, dtype: int64 Proportions (%): Road_Surface_Conditions 1 83.284457 2 15.542522 4 0.938416 3 0.234604 Name: proportion, dtype: float64 Mode: 1
Statistics for Special_Conditions_at_Site: Counts: Special_Conditions_at_Site 0 1681 4 12 6 8 5 1 7 1 1 1 3 1 Name: count, dtype: int64 Proportions (%): Special_Conditions_at_Site 0 98.592375 4 0.703812 6 0.469208 5 0.058651 7 0.058651 1 0.058651 3 0.058651 Name: proportion, dtype: float64 Mode: 0
Statistics for Carriageway_Hazards: Counts: Carriageway_Hazards 0 1687 7 7 2 5 1 3 6 2 3 1 Name: count, dtype: int64 Proportions (%): Carriageway_Hazards 0 98.944282 7 0.410557 2 0.293255 1 0.175953 6 0.117302 3 0.058651 Name: proportion, dtype: float64 Mode: 0
Statistics for Urban_or_Rural_Area: Counts: Urban_or_Rural_Area 1 1659 2 46 Name: count, dtype: int64 Proportions (%): Urban_or_Rural_Area 1 97.302053 2 2.697947 Name: proportion, dtype: float64 Mode: 1
Statistics for Did_Police_Officer_Attend_Scene_of_Accident: Counts: Did_Police_Officer_Attend_Scene_of_Accident 1 1397 2 295 3 13 Name: count, dtype: int64 Proportions (%): Did_Police_Officer_Attend_Scene_of_Accident 1 81.935484 2 17.302053 3 0.762463 Name: proportion, dtype: float64 Mode: 1
Statistics for LSOA_of_Accident_Location:
Counts:
LSOA_of_Accident_Location
E01002968 195
E01002984 59
E01002948 57
E01003003 50
E01003004 45
...
E01030388 1
E01003445 1
E01002937 1
E01003817 1
E01002923 1
Name: count, Length: 100, dtype: int64
Proportions (%):
LSOA_of_Accident_Location
E01002968 11.436950
E01002984 3.460411
E01002948 3.343109
E01003003 2.932551
E01003004 2.639296
...
E01030388 0.058651
E01003445 0.058651
E01002937 0.058651
E01003817 0.058651
E01002923 0.058651
Name: proportion, Length: 100, dtype: float64
Mode: E01002968
InΒ [26]:
from scipy.stats import skew, kurtosis
# Select numerical columns from my_data_cleaned
numerical_cols = my_data_cleaned.select_dtypes(include='number').columns
for col in numerical_cols:
print(f"Statistics for {col}:\n")
# Calculate statistics
value_range = (my_data_cleaned[col].min(), my_data_cleaned[col].max())
avg = my_data_cleaned[col].mean()
std_dev = my_data_cleaned[col].std()
skewness = skew(my_data_cleaned[col])
kurt = kurtosis(my_data_cleaned[col])
mode = my_data_cleaned[col].mode()[0]
print(f"Range: {value_range}")
print(f"Mean: {avg}")
print(f"Standard Deviation: {std_dev}")
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")
print(f"Mode: {mode}\n")
# Visualization: Histogram
plt.figure(figsize=(8, 4))
sns.histplot(my_data_cleaned[col], bins=30, color='skyblue', kde=True)
plt.title(f"Distribution of {col}")
plt.xlabel(col)
plt.ylabel("Frequency")
plt.show()
Statistics for Longitude: Range: (np.float64(-0.322592), np.float64(-0.240029)) Mean: -0.28657742873900294 Standard Deviation: 0.019574295133454777 Skewness: 0.5346755208852578 Kurtosis: -0.9840962485637044 Mode: -0.318182
Statistics for Latitude: Range: (np.float64(51.340366), np.float64(51.436819)) Mean: 51.396396457478005 Standard Deviation: 0.019169678043049053 Skewness: -0.5224307522997643 Kurtosis: 0.10574098794478148 Mode: 51.341169
Statistics for Number_of_Vehicles: Range: (np.int64(1), np.int64(8)) Mean: 1.790615835777126 Standard Deviation: 0.6215278451588202 Skewness: 1.2766063409035726 Kurtosis: 9.268785771091464 Mode: 2
Statistics for Number_of_Casualties: Range: (np.int64(1), np.int64(1)) Mean: 1.0 Standard Deviation: 0.0 Skewness: nan Kurtosis: nan Mode: 1
C:\Users\USER\AppData\Local\Temp\ipykernel_23516\1517211522.py:13: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable. skewness = skew(my_data_cleaned[col]) C:\Users\USER\AppData\Local\Temp\ipykernel_23516\1517211522.py:14: RuntimeWarning: Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable. kurt = kurtosis(my_data_cleaned[col])